Putative TFs are determined by genes with GO terms
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Fs_LRT.1000 <-
readr::read_csv("data/Fs_LRT_1000.csv")
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fd_LRT.1000 <-
readr::read_csv("data/Fd_LRT_1000.csv")
## Rows: 1000 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Non-transformed
Fd_PES <-
readr::read_csv(file = "data/Fd_PES.csv")
## Rows: 7907 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (9): PS, gamete, E1, E2, E3, E4, E5, E6, matSP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fd_PES_F <-
# readr::read_csv(file = "data/Fd_PES_F.csv")
# Fd_PES_M <-
# readr::read_csv(file = "data/Fd_PES_M.csv")
Fs_PES <-
readr::read_csv(file = "data/Fs_PES.csv")
## Rows: 8291 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (8): PS, gamete, 24H, 48H, 1w, 3w, 4w, matSP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fs_PES_F <-
# readr::read_csv(file = "data/Fs_PES_F.csv")
# Fs_PES_M <-
# readr::read_csv(file = "data/Fs_PES_M.csv")
Ec_PES_32m <-
readr::read_csv(file = "data/Ec_PES_32m.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_PES_25f <-
readr::read_csv(file = "data/Ec_PES_25f.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sqrt-tranformed
Fd_PES.sqrt <-
readr::read_csv(file = "data/Fd_PES.sqrt.csv")
## Rows: 7907 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (9): PS, gamete, E1, E2, E3, E4, E5, E6, matSP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fd_PES_F.sqrt <-
# readr::read_csv(file = "data/Fd_PES_F.sqrt.csv")
# Fd_PES_M.sqrt <-
# readr::read_csv(file = "data/Fd_PES_M.sqrt.csv")
Fs_PES.sqrt <-
readr::read_csv(file = "data/Fs_PES.sqrt.csv")
## Rows: 8291 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (8): PS, gamete, 24H, 48H, 1w, 3w, 4w, matSP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fs_PES_F.sqrt <-
# readr::read_csv(file = "data/Fs_PES_F.sqrt.csv")
# Fs_PES_M.sqrt <-
# readr::read_csv(file = "data/Fs_PES_M.sqrt.csv")
Ec_PES_32m.sqrt <-
readr::read_csv(file = "data/Ec_PES_32m.sqrt.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_PES_25f.sqrt <-
readr::read_csv(file = "data/Ec_PES_25f.sqrt.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
log2-tranformed
Fd_PES.log2 <-
readr::read_csv(file = "data/Fd_PES.log2.csv")
## Rows: 7907 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (9): PS, gamete, E1, E2, E3, E4, E5, E6, matSP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fd_PES_F.log2 <-
# readr::read_csv(file = "data/Fd_PES_F.log2.csv")
# Fd_PES_M.log2 <-
# readr::read_csv(file = "data/Fd_PES_M.log2.csv")
Fs_PES.log2 <-
readr::read_csv(file = "data/Fs_PES.log2.csv")
## Rows: 8291 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (8): PS, gamete, 24H, 48H, 1w, 3w, 4w, matSP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fs_PES_F.log2 <-
# readr::read_csv(file = "data/Fs_PES_F.log2.csv")
# Fs_PES_M.log2 <-
# readr::read_csv(file = "data/Fs_PES_M.log2.csv")
Ec_PES_32m.log2 <-
readr::read_csv(file = "data/Ec_PES_32m.log2.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_PES_25f.log2 <-
readr::read_csv(file = "data/Ec_PES_25f.log2.csv")
## Rows: 11571 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): GeneID
## dbl (10): PS, meiospore, immGA, matGA, oldGA, gamete, earlyPSP, immPSP, matP...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
To avoid
Error in vroom_(file, delim = delim %||% col_types$delim, col_names = col_names, : bad value
, I load the packages later.
library(ggfortify)
library(tximport)
library(myTAI)
IPS_Fs <-
read_delim("data/Interproscan/Fs_ips/F_serratus_pep.filt.processed.faa.tsv",
delim = "\t", col_select = c(1,4,12:14)) %>%
separate_rows(GO_term, sep = "\\|") %>%
dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 247635 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IPS_Fd <-
read_delim("data/Interproscan/Fd_ips/F_distichus_pep.filt.processed.faa.tsv",
delim = "\t", col_select = c(1,4,12:14)) %>%
separate_rows(GO_term, sep = "\\|") %>%
dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 161271 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IPS_Ec <-
read_delim("data/Interproscan//Ec_ips/Ectocarpus-sp7.filt.processed.faa.tsv",
delim = "\t", col_select = c(1,4,12:14)) %>%
separate_rows(GO_term, sep = "\\|") %>%
dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 224135 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IPS_Ec_full <-
read_delim("data/Interproscan//Ec_ips/Ectocarpus-sp7.filt.processed.faa.tsv",
delim = "\t", col_select = c(1,4,12:14)) %>%
separate_rows(GO_term, sep = "\\|") %>%
# dplyr::mutate(GeneID = str_remove(GeneID, "\\.[^.]+$")) %>%
distinct()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 224135 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): GeneID, Analysis, InterPro_accession, InterPro_description, GO_term
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Naive analysis of GO terms for transcription factors.
# ECTOCARPUS
# 248 genes selected
Ec_TF_list <- IPS_Ec %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()
# For full gene name (315 genes selected)
Ec_TF_full_list <- IPS_Ec_full %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()
# FUCUS
# 269 genes selected
Fs_TF_list <- IPS_Fs %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()
# 199 genes selected
Fd_TF_list <- IPS_Fd %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355|GO:0003700|GO:0140110|GO:0006351|GO:0006351|GO:0051090|GO:0001217|GO:0051091|GO:0043433",GO_term)) %>% select(GeneID) %>% distinct()
Fs_TF_list %>%
dplyr::mutate(LRT = GeneID %in% Fs_LRT.1000$GeneID) %>%
ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
ggplot2::geom_bar(position = position_stack(), colour = "black") +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::theme_classic() +
ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
subtitle = "Fucus serratus") +
ggplot2::coord_flip()
Fd_TF_list %>%
dplyr::mutate(LRT = GeneID %in% Fd_LRT.1000$GeneID) %>%
ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
ggplot2::geom_bar(position = position_stack(), colour = "black") +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::theme_classic() +
ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
subtitle = "Fucus distichus") +
ggplot2::coord_flip()
What else may the LRT.1000 genes be?
IPS_Ec %>%
dplyr::filter(GeneID %in% Ec_TF_list$GeneID) %>%
dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fs %>%
dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fd %>%
dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
dplyr::select(-c(Analysis, InterPro_accession,GO_term))
Fs_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
myTAI::CollapseReplicates(nrep = 5, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Fucus serratus") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Fd_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
myTAI::CollapseReplicates(nrep = 6, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Fucus distichus") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Ec_PES_25f %>%
myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Ectocarpus female") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Ec_PES_32m %>%
myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Ectocarpus male") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Naive analysis of GO terms for transcription factors.
This is used in Piasecka et al. 2013
# ECTOCARPUS
# 209 genes selected
Ec_TF_list <- IPS_Ec %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()
# For full gene name (269 genes selected)
Ec_TF_full_list <- IPS_Ec_full %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()
# FUCUS
# 213 genes selected
Fs_TF_list <- IPS_Fs %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()
# 162 genes selected
Fd_TF_list <- IPS_Fd %>% select(GeneID, GO_term) %>% dplyr::filter(grepl("GO:0006355",GO_term)) %>% select(GeneID) %>% distinct()
Fs_TF_list %>%
dplyr::mutate(LRT = GeneID %in% Fs_LRT.1000$GeneID) %>%
ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
ggplot2::geom_bar(position = position_stack(), colour = "black") +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::theme_classic() +
ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
subtitle = "Fucus serratus") +
ggplot2::coord_flip()
Fd_TF_list %>%
dplyr::mutate(LRT = GeneID %in% Fd_LRT.1000$GeneID) %>%
ggplot2::ggplot(aes(x = "Putative TFs",fill = LRT)) +
ggplot2::geom_bar(position = position_stack(), colour = "black") +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::theme_classic() +
ggplot2::labs(x = "", title = "Most putative TFs are not part of LRT",
subtitle = "Fucus distichus") +
ggplot2::coord_flip()
What else may the LRT.1000 genes be?
IPS_Ec %>%
dplyr::filter(GeneID %in% Ec_TF_list$GeneID) %>%
dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fs %>%
dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
dplyr::select(-c(Analysis, InterPro_accession,GO_term))
IPS_Fd %>%
dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
dplyr::select(-c(Analysis, InterPro_accession,GO_term))
Fs_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
myTAI::CollapseReplicates(nrep = 5, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Fucus serratus") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Fd_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
myTAI::CollapseReplicates(nrep = 6, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Fucus distichus") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Ec_PES_25f %>%
myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Ectocarpus female") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Ec_PES_32m %>%
myTAI::CollapseReplicates(nrep = 9, FUN = mean, stage.names = "MeanExpression") %>%
dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
ggplot2::ggplot(aes(MeanExpression, fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_histogram() +
ggplot2::labs(title = "Putative TFs expression (both axes sqrt-scaled)",
subtitle = "Ectocarpus male") +
ggplot2::scale_x_sqrt() +
ggplot2::scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This leads to essentially the same results.
Fs_pval_TFs <-
Fs_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
permutations = 1000)
##
## Total runtime of your permutation test: 0.006 seconds.
## Total runtime of your permutation test: 0.006 seconds.
## Total runtime of your permutation test: 0.004 seconds.
## Total runtime of your permutation test: 0.004 seconds.
Fd_pval_TFs <-
Fd_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
permutations = 1000)
##
## Total runtime of your permutation test: 0.016 seconds.
## Total runtime of your permutation test: 0.004 seconds.
## Total runtime of your permutation test: 0.004 seconds.
## Total runtime of your permutation test: 0.004 seconds.
Fs_PES_tS_res <-
Fs_pval_TFs %>%
tibble::as_tibble(rownames = "transformation") %>%
dplyr::rename("pval" = value) %>%
dplyr::mutate(test = "FlatLineTest")
Fd_PES_tS_res <-
Fd_pval_TFs %>%
tibble::as_tibble(rownames = "transformation") %>%
dplyr::rename("pval" = value) %>%
dplyr::mutate(test = "FlatLineTest")
Fs_PES_tS_res %>%
ggplot2::ggplot(
aes(
y = -log10(pval),
x = transformation)) +
ggplot2::geom_col(width = 0.05) +
ggplot2::geom_point(size = 5) +
ggplot2::geom_hline(
yintercept = -log10(0.05),
colour = "#E64B35FF",
linetype = 'dashed',
size = 2
) +
ggplot2::labs(
title = "Fucus serratus",
subtitle = "FlatLineTest",
x = "Transformation"
) +
ggplot2::coord_flip() +
ggplot2::theme_classic()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Fd_PES_tS_res %>%
ggplot2::ggplot(
aes(
y = -log10(pval),
x = transformation)) +
ggplot2::geom_col(width = 0.05) +
ggplot2::geom_point(size = 5) +
ggplot2::geom_hline(
yintercept = -log10(0.05),
colour = "#E64B35FF",
linetype = 'dashed',
size = 2
) +
ggplot2::labs(
title = "Fucus distichus",
subtitle = "FlatLineTest",
x = "Transformation"
) +
ggplot2::coord_flip() +
ggplot2::theme_classic()
Fs_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.006 seconds.
## Significance status of signature: not significant (= no evolutionary signature in the transcriptome).
Fd_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.005 seconds.
## Significance status of signature: not significant (= no evolutionary signature in the transcriptome).
Fs_PES.sqrt %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.005 seconds.
## Significance status of signature: not significant (= no evolutionary signature in the transcriptome).
Fd_PES.sqrt %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
myTAI::PlotSignature(main = "Fucus distichus",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.005 seconds.
## Significance status of signature: not significant (= no evolutionary signature in the transcriptome).
Damn… Only 96 genes are retained in F. serratus (~1.2%) and 108 in F. distichus (~1.4%).
The flat line is not such an issue for Ectocarpus
Ec_M_pval_TFs <-
Ec_PES_32m %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
permutations = 1000)
##
## Total runtime of your permutation test: 0.006 seconds.
## Total runtime of your permutation test: 0.006 seconds.
## Total runtime of your permutation test: 0.005 seconds.
## Total runtime of your permutation test: 0.005 seconds.
Ec_F_pval_TFs <-
Ec_PES_25f %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::tfStability(transforms = c("none", "sqrt", "log2", "rank"),
permutations = 1000)
##
## Total runtime of your permutation test: 0.005 seconds.
## Total runtime of your permutation test: 0.005 seconds.
## Total runtime of your permutation test: 0.004 seconds.
## Total runtime of your permutation test: 0.005 seconds.
Ec_F_PES_tS_res <-
Ec_F_pval_TFs %>%
tibble::as_tibble(rownames = "transformation") %>%
dplyr::rename("pval" = value) %>%
dplyr::mutate(test = "FlatLineTest")
Ec_M_PES_tS_res <-
Ec_M_pval_TFs %>%
tibble::as_tibble(rownames = "transformation") %>%
dplyr::rename("pval" = value) %>%
dplyr::mutate(test = "FlatLineTest")
Ec_F_PES_tS_res %>%
ggplot2::ggplot(
aes(
y = -log10(pval),
x = transformation)) +
ggplot2::geom_col(width = 0.05) +
ggplot2::geom_point(size = 5) +
ggplot2::geom_hline(
yintercept = -log10(0.05),
colour = "#E64B35FF",
linetype = 'dashed',
size = 2
) +
ggplot2::labs(
title = "Ectocarpus female",
subtitle = "FlatLineTest",
x = "Transformation"
) +
ggplot2::coord_flip() +
ggplot2::theme_classic()
Ec_M_PES_tS_res %>%
ggplot2::ggplot(
aes(
y = -log10(pval),
x = transformation)) +
ggplot2::geom_col(width = 0.05) +
ggplot2::geom_point(size = 5) +
ggplot2::geom_hline(
yintercept = -log10(0.05),
colour = "#E64B35FF",
linetype = 'dashed',
size = 2
) +
ggplot2::labs(
title = "Ectocarpus male",
subtitle = "FlatLineTest",
x = "Transformation"
) +
ggplot2::coord_flip() +
ggplot2::theme_classic()
Ec_PES_32m %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::PlotSignature(main = "Ectocarpus male",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.007 seconds.
## Significance status of signature: significant.
Ec_PES_25f %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::PlotSignature(main = "Ectocarpus female",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.006 seconds.
## Significance status of signature: not significant (= no evolutionary signature in the transcriptome).
Ec_PES_32m.sqrt %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::PlotSignature(main = "Ectocarpus male",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.005 seconds.
## Significance status of signature: significant.
Ec_PES_25f.sqrt %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::PlotSignature(main = "Ectocarpus female",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.005 seconds.
## Significance status of signature: significant.
Ec_PES_32m.log2 %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::PlotSignature(main = "Ectocarpus male",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.005 seconds.
## Significance status of signature: significant.
Ec_PES_25f.log2 %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID) %>%
myTAI::PlotSignature(main = "Ectocarpus female",measure = "TAI",xlab = "TAI")
## Plot signature: ' TAI ' and test statistic: ' FlatLineTest ' running 1000 permutations.
##
## Total runtime of your permutation test: 0.005 seconds.
## Significance status of signature: significant.
I think these patterns are interesting but the interpretability is undermined by the fact that only ~168 genes are employed here. This less than 5% of genes, woefully ignoring the entire transcriptome.
Fs_PES %>%
dplyr::select(PS,GeneID) %>%
dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
ggplot2::ggplot(aes(as.factor(PS), fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_bar() +
ggplot2::labs(title = "Putative TFs PS",
subtitle = "Fucus serratus",
x = "PS", y = "Number of genes (log-scaled)") +
ggplot2::scale_y_log10()
Fd_PES %>%
dplyr::select(PS,GeneID) %>%
dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
ggplot2::ggplot(aes(as.factor(PS), fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_bar() +
ggplot2::labs(title = "Putative TFs PS",
subtitle = "Fucus distichus",
x = "PS", y = "Number of genes (log-scaled)") +
ggplot2::scale_y_log10()
Ec_PES_25f %>%
dplyr::select(PS,GeneID) %>%
dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
ggplot2::ggplot(aes(as.factor(PS), fill = PutativeTF)) +
ggplot2::scale_fill_manual(values = c("#DC0000FF", "white")) +
ggplot2::geom_bar() +
ggplot2::labs(title = "Putative TFs PS",
subtitle = "Ectocarpus",
x = "PS", y = "Number of genes (log-scaled)") +
ggplot2::scale_y_log10()
Putative TFs are distributed across several PS.
Fs_PES %>%
dplyr::select(PS,GeneID) %>%
dplyr::mutate(PutativeTF = GeneID %in% Fs_TF_list$GeneID) %>%
dplyr::group_by(PutativeTF) %>%
dplyr::summarise(mean(PS))
Fs_PES_true <- Fs_PES %>%
dplyr::filter(GeneID %in% Fs_TF_list$GeneID)
Fs_PES_false <- Fs_PES %>%
dplyr::filter(!GeneID %in% Fs_TF_list$GeneID)
ks.test(Fs_PES_true$PS,Fs_PES_false$PS)
## Warning in ks.test.default(Fs_PES_true$PS, Fs_PES_false$PS): p-value will be
## approximate in the presence of ties
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: Fs_PES_true$PS and Fs_PES_false$PS
## D = 0.085279, p-value = 0.4951
## alternative hypothesis: two-sided
Fd_PES %>%
dplyr::select(PS,GeneID) %>%
dplyr::mutate(PutativeTF = GeneID %in% Fd_TF_list$GeneID) %>%
dplyr::group_by(PutativeTF) %>%
dplyr::summarise(mean(PS))
Fd_PES_true <- Fd_PES %>%
dplyr::filter(GeneID %in% Fd_TF_list$GeneID)
Fd_PES_false <- Fd_PES %>%
dplyr::filter(!GeneID %in% Fd_TF_list$GeneID)
ks.test(Fd_PES_true$PS,Fd_PES_false$PS)
## Warning in ks.test.default(Fd_PES_true$PS, Fd_PES_false$PS): p-value will be
## approximate in the presence of ties
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: Fd_PES_true$PS and Fd_PES_false$PS
## D = 0.14818, p-value = 0.01859
## alternative hypothesis: two-sided
Ec_PES_25f %>%
dplyr::select(PS,GeneID) %>%
dplyr::mutate(PutativeTF = GeneID %in% Ec_TF_full_list$GeneID) %>%
dplyr::group_by(PutativeTF) %>%
dplyr::summarise(mean(PS))
Ec_PES_true <- Ec_PES_25f %>%
dplyr::filter(GeneID %in% Ec_TF_full_list$GeneID)
Ec_PES_false <- Ec_PES_25f %>%
dplyr::filter(!GeneID %in% Ec_TF_full_list$GeneID)
ks.test(Ec_PES_true$PS,Ec_PES_false$PS)
## Warning in ks.test.default(Ec_PES_true$PS, Ec_PES_false$PS): p-value will be
## approximate in the presence of ties
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: Ec_PES_true$PS and Ec_PES_false$PS
## D = 0.10315, p-value = 0.059
## alternative hypothesis: two-sided
PS is higher in putative TFs but it is not significantly so.
Since so little genes are considered, I can look at the expression profile of each one! Perhaps the ones with the most variance?
Fs_TF <-
Fs_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fs_TF_list$GeneID) %>%
tidyr::pivot_longer(!c(PS,GeneID), names_to = "Stage", values_to = "TPM")
Fs_TF$Stage <- base::factor(Fs_TF$Stage, unique(Fs_TF$Stage))
Fd_TF <-
Fd_PES %>%
dplyr::select(-c(gamete,matSP)) %>%
dplyr::filter(GeneID %in% Fd_TF_list$GeneID) %>%
tidyr::pivot_longer(!c(PS,GeneID), names_to = "Stage", values_to = "TPM")
Fd_TF$Stage <- base::factor(Fd_TF$Stage, unique(Fd_TF$Stage))
We filter for genes with mean TPM below 0.5 and CV (coefficient of variance) over 1. This is fair since we are selecting the genes with the most expression variance.
Fs_TF_highCV <-
Fs_TF %>%
dplyr::group_by(GeneID) %>%
dplyr::summarise(variance = var(TPM),
mean_TPM = mean(TPM)) %>%
dplyr::filter(mean_TPM > 0.5) %>%
dplyr::group_by(GeneID) %>%
dplyr::summarise(CV = sqrt(variance)/mean_TPM) %>%
dplyr::filter(CV > 1)
Fs_TF %>%
dplyr::filter(GeneID %in% Fs_TF_highCV$GeneID) %>%
ggplot2::ggplot(aes(x = Stage,
y = log2(TPM+1),
group = GeneID,
colour = GeneID)) +
ggplot2::geom_line(size = 2, alpha = 0.5) +
ggplot2::labs(title = "Fucus serratus") +
scale_colour_viridis_d()
Fd_TF_highCV <-
Fd_TF %>%
dplyr::group_by(GeneID) %>%
dplyr::summarise(variance = var(TPM),
mean_TPM = mean(TPM)) %>%
dplyr::filter(mean_TPM > 0.5) %>%
dplyr::group_by(GeneID) %>%
dplyr::summarise(CV = sqrt(variance)/mean_TPM) %>%
dplyr::filter(CV > 1)
Fd_TF %>%
dplyr::filter(GeneID %in% Fd_TF_highCV$GeneID) %>%
ggplot2::ggplot(aes(x = Stage,
y = log2(TPM+1),
group = GeneID,
colour = GeneID)) +
ggplot2::geom_line(size = 2, alpha = 0.5) +
ggplot2::labs(title = "Fucus distichus") +
scale_colour_viridis_d()
> Fs_TF_highCV$GeneID
[1] "ser_TRINITY_DN11159_c0_g1" "ser_TRINITY_DN11241_c0_g2"
[3] "ser_TRINITY_DN16240_c0_g2" "ser_TRINITY_DN46003_c0_g1"
> Fd_TF_highCV$GeneID
[1] "dis_TRINITY_DN1365_c0_g3" "dis_TRINITY_DN329_c0_g1"
[3] "dis_TRINITY_DN3687_c0_g2" "dis_TRINITY_DN7838_c0_g3"
[5] "dis_TRINITY_DN8105_c0_g3"
It appears dis_TRINITY_DN329_c0_g1 and
dis_TRINITY_DN8105_c0_g3 share a similar expression profile
to
source("functions/parse_orthogroups.R")
OG_tx2gene <-
parse_orthogroups("data/Results_Nov29/Orthogroups/Orthogroups.tsv") %>%
stack() %>%
tidyr::as_tibble() %>%
dplyr::rename(GeneID = values) %>%
dplyr::rename(OG = ind) %>%
dplyr::relocate(GeneID) %>%
dplyr::mutate(GeneID = str_remove(GeneID, "\\.p[0-9].*"))
> OG_tx2gene %>%
+ dplyr::filter(GeneID %in% c(Fd_TF_highCV$GeneID, Fs_TF_highCV$GeneID))
# A tibble: 8 × 2
GeneID OG
<chr> <fct>
1 dis_TRINITY_DN329_c0_g1 OG0003154
2 dis_TRINITY_DN3687_c0_g2 OG0003246
3 dis_TRINITY_DN3687_c0_g3 OG0003246
4 ser_TRINITY_DN11159_c0_g1 OG0003687
5 dis_TRINITY_DN8105_c0_g3 OG0003946
6 ser_TRINITY_DN16240_c0_g2 OG0005153
7 dis_TRINITY_DN1365_c0_g3 OG0005332
8 ser_TRINITY_DN11241_c0_g2 OG0007518
These genes are not related… It is simply not easy to compare the expression of orthogroup genes.
OG_filt <-
OG_tx2gene %>%
dplyr::filter(GeneID %in% c(Fs_TF_list$GeneID, Fd_TF_list$GeneID))
Fs_abundance_filt <-
readr::read_csv(file = "data/Fs_OG_abundance.csv") %>%
dplyr::filter(OG %in% OG_filt$OG)
## Rows: 5596 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (48): Fs_F_gametes_1, Fs_F_gametes_2, Fs_F_gametes_3, Fs_M_gametes_1, Fs...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fd_abundance_filt <-
readr::read_csv(file = "data/Fd_OG_abundance.csv") %>%
dplyr::filter(OG %in% OG_filt$OG)
## Rows: 5410 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (34): Fd_f_gametes_1, Fd_f_gametes_2, Fd_f_gametes_3, Fd_m_gametes_1, Fd...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Many OGs are lost this way. We have 78 OGs in Fd (from 5410) and 77 in Fs (from 5596). By applying inner_join, we obtain just 59 OGs.
sample_info_fd <-
readr::read_csv("data/sample_info_fd.csv")
## Rows: 34 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): SampleName, SeqProt, Species, stage_tissue, Stage, Tissue, Experime...
## dbl (1): Replicate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_info_fs <-
readr::read_csv("data/sample_info_fs.csv")
## Rows: 48 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): SampleName, SeqProt, Species, stage_tissue, Stage, Tissue, Experime...
## dbl (1): Replicate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_info_fd_embryo <-
sample_info_fd %>%
dplyr::filter(Stage %in% c("E1", "E2", "E3", "E4", "E5", "E6")) %>%
dplyr::mutate(LibLab = str_c(Species, "_", Stage, "_", Replicate))
sample_info_fs_embryo <-
sample_info_fs %>%
dplyr::filter(Stage %in% c("24H", "48H", "1w", "3w", "4w")) %>%
dplyr::mutate(Replicate = case_when(
LibName == "Fs_X_zygotes_48h_1" ~ 4,
LibName == "Fs_X_zygotes_48h_2" ~ 5,
LibName == "Fs_X_zygotes_48h_3" ~ 6,
TRUE ~ Replicate
)) %>%
mutate(LibLab = str_c(Species, "_", Stage, "_", Replicate))
selectGenes <- function(counts, min.count=2){
keep <-
counts[rowMeans(counts)>min.count, ]
return(keep)
}
To minimise differences between species, I remove OGs with low counts.
combined_metatable <-
rbind(sample_info_fs_embryo, sample_info_fd_embryo) %>%
relocate(LibName)
merged_TPM_pre <-
merge(Fs_abundance_filt, Fd_abundance_filt, by = "OG") %>%
dplyr::select(1, combined_metatable$LibName)
merged_TPM <-
data.frame(row.names = merged_TPM_pre[,1], merged_TPM_pre[,-1], check.names = FALSE)
merged_TPM.filt <-
as.matrix(merged_TPM) %>%
selectGenes(min.count=10)
10 OGs are lost after the TPM 10 filter (from 59 to 49).
log2_TPM <- log2(merged_TPM.filt+1)
sqrt_TPM <- sqrt(merged_TPM.filt)
tpm10_TPM <- (merged_TPM.filt > 10) * 1
combined_metatable$Stage <- factor(combined_metatable$Stage, levels = unique(combined_metatable$Stage))
myPCAfunction <- function(pcDat, data, x, y, ...){
OUT <-
ggplot2::autoplot(
pcDat,
data = data,
frame.colour = "Stage",
colour = "Stage",
shape = "Species",
x = x,
y = y,
size = 3,
frame = TRUE,
alpha = 0.5,
...)
return(OUT)
}
pcDat <- prcomp(t(log2_TPM), scale = TRUE)
PC1_2 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2) +
labs(title = "PC1 and PC2") +
theme_grey()
PC1_3 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3) +
labs(title = "PC1 and PC3") +
theme_grey()
log2_plot <- ggpubr::ggarrange(PC1_2, PC1_3, common.legend = T)
ggpubr::annotate_figure(log2_plot, top = ggpubr::text_grob("log2",
face = "bold", size = 14))
pcDat <- prcomp(t(sqrt_TPM), scale = TRUE)
# pcDat <- prcomp(t(rlog_TPM))
PC1_2 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2) +
labs(title = "PC1 and PC2") +
theme_grey()
PC1_3 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3) +
labs(title = "PC1 and PC3") +
theme_grey()
sqrt_plot <- ggpubr::ggarrange(PC1_2, PC1_3, common.legend = T)
ggpubr::annotate_figure(sqrt_plot, top = ggpubr::text_grob("sqrt",
face = "bold", size = 14))
# pcDat <- prcomp(t(tpm10_TPM)) # too many zeros to do a scale = TRUE
#
# PC1_2 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2) +
# labs(title = "PC1 and PC2") +
# theme_grey()
#
# PC1_3 <- myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3) +
# labs(title = "PC1 and PC3") +
# theme_grey()
#
# rlog_plot <- ggpubr::ggarrange(PC1_2, PC1_3, common.legend = T)
# ggpubr::annotate_figure(rlog_plot, top = ggpubr::text_grob("tpm10 threshold",
# face = "bold", size = 14))
# with loading
pcDat <- prcomp(t(log2_TPM), scale = TRUE)
myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 2, loadings = TRUE, loadings.colour = 'black', loadings.label.colour="black", loadings.label = TRUE, loadings.label.size = 2, loadings.label.repel=TRUE) +
labs(title = "PC1 and PC2", subtitle = "log2 with loading") +
theme_grey()
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
myPCAfunction(pcDat, data = combined_metatable, x = 1, y = 3, loadings = TRUE, loadings.colour = 'black', loadings.label.colour="black", loadings.label = TRUE, loadings.label.size = 2, loadings.label.repel=TRUE) +
labs(title = "PC1 and PC3", subtitle = "log2 with loading") +
theme_grey()
The general structure of the embryogenesis PCA is maintained when using putative TFs. Unfortunately, the expression level of only 49 OGs are captured here. The TPM10 threshold performs very poorly here so I have removed it.
Here, we use distance metrics to quantify how distant the stages are from each other across Fucus species and maybe even in Ectocarpus.
library(philentropy)
ncol_Fs <- grep( "Fs" , colnames( log2_TPM ) )
ncol_Fd <- grep( "Fd" , colnames( log2_TPM ) )
log2_TPM_renamed <- log2_TPM
base::colnames(log2_TPM_renamed) <- combined_metatable$LibLab
M = cor(x = log2_TPM_renamed[,ncol_Fs],
y = log2_TPM_renamed[,ncol_Fd],
method = "pearson")
M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
geom_tile() +
geom_text(aes(label = paste0(round(median_corr,3))), color = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_gradient(low = "white", high = "#DC0000FF") +
labs(x = "Fucus serratus samples",
y = "Fucus distichus samples",
title = "Pearson correlation (log2)",
fill = "median(correlation)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Monotonic relationship
log2_TPM_renamed <- log2_TPM
base::colnames(log2_TPM_renamed) <- combined_metatable$LibLab
M = cor(x = log2_TPM_renamed[,ncol_Fs],
y = log2_TPM_renamed[,ncol_Fd],
method = "spearman")
M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
geom_tile() +
geom_text(aes(label = paste0(round(median_corr,3))), color = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_gradient(low = "white", high = "#DC0000FF") +
labs(x = "Fucus serratus samples",
y = "Fucus distichus samples",
title = "Spearman correlation (log2)",
fill = "median(correlation)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
DM <- philentropy::distance(t(log2_TPM_renamed), use.row.names = TRUE, method = "manhattan")
## Metric: 'manhattan'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
geom_tile() +
geom_text(aes(label = paste0(round(median_corr,0))), color = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_gradient(low = "white", high = "#3C5488FF") +
labs(x = "Fucus serratus samples",
y = "Fucus distichus samples",
title = "Manhattan distance (log2)",
fill = "median(distance)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
mat_normalized <- apply(log2_TPM_renamed, 2, function(x) x/sum(x)) %>% as.data.frame()
# colSums(mat_normalized)
DM <- philentropy::distance(t(mat_normalized), use.row.names = TRUE, method = "jensen-shannon")
## Metric: 'jensen-shannon' using unit: 'log'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
DM3 <- sqrt(DM2)
M_melt <- reshape2::melt(DM3)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = Fd, fill = median_corr)) +
geom_tile() +
geom_text(aes(label = paste0(round(median_corr,3))), color = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_gradient(low = "white", high = "#3C5488FF") +
labs(x = "Fucus serratus samples",
y = "Fucus distichus samples",
title = "JSD metric (norm. log2)",
fill = "median(distance)")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
It is only the very early stages that have the most correlated expression putative TF marker. This is seen with the Pearson and Spearman correlations and can be analogously identified with distance-based approaches.
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.2 (2022-10-31)
## os macOS Big Sur ... 10.16
## system x86_64, darwin17.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz Europe/Berlin
## date 2023-12-04
## pandoc 3.1.6.2 @ /usr/local/bin/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 4.2.0)
## backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0)
## bit 4.0.5 2022-11-15 [1] CRAN (R 4.2.0)
## bit64 4.0.5 2020-08-30 [1] CRAN (R 4.2.0)
## broom 1.0.5 2023-06-09 [1] CRAN (R 4.2.0)
## bslib 0.5.1 2023-08-11 [1] CRAN (R 4.2.0)
## cachem 1.0.8 2023-05-01 [1] CRAN (R 4.2.0)
## callr 3.7.3 2022-11-02 [1] CRAN (R 4.2.0)
## car 3.1-2 2023-03-30 [1] CRAN (R 4.2.0)
## carData 3.0-5 2022-01-06 [1] CRAN (R 4.2.0)
## cli 3.6.1 2023-03-23 [1] CRAN (R 4.2.0)
## codetools 0.2-19 2023-02-01 [1] CRAN (R 4.2.0)
## colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0)
## cowplot 1.1.1 2020-12-30 [1] CRAN (R 4.2.0)
## crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.0)
## devtools 2.4.5 2022-10-11 [1] CRAN (R 4.2.0)
## digest 0.6.33 2023-07-07 [1] CRAN (R 4.2.0)
## dplyr * 1.1.3 2023-09-03 [1] CRAN (R 4.2.0)
## ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0)
## evaluate 0.22 2023-09-29 [1] CRAN (R 4.2.2)
## fansi 1.0.5 2023-10-08 [1] CRAN (R 4.2.2)
## farver 2.1.1 2022-07-06 [1] CRAN (R 4.2.0)
## fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.0)
## fitdistrplus 1.1-11 2023-04-25 [1] CRAN (R 4.2.0)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.2.0)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.2.0)
## fs 1.6.3 2023-07-20 [1] CRAN (R 4.2.0)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0)
## ggfortify * 0.4.16 2023-03-20 [1] CRAN (R 4.2.0)
## ggplot2 * 3.4.4 2023-10-12 [1] CRAN (R 4.2.2)
## ggpubr 0.6.0 2023-02-10 [1] CRAN (R 4.2.0)
## ggrepel 0.9.4 2023-10-13 [1] CRAN (R 4.2.2)
## ggsignif 0.6.4 2022-10-13 [1] CRAN (R 4.2.0)
## glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
## gridExtra 2.3 2017-09-09 [1] CRAN (R 4.2.0)
## gtable 0.3.4 2023-08-21 [1] CRAN (R 4.2.0)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.2.0)
## htmltools 0.5.6.1 2023-10-06 [1] CRAN (R 4.2.2)
## htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.2.0)
## httpuv 1.6.11 2023-05-11 [1] CRAN (R 4.2.2)
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.2.0)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.2.0)
## jsonlite 1.8.7 2023-06-29 [1] CRAN (R 4.2.0)
## knitr 1.44 2023-09-11 [1] CRAN (R 4.2.0)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.2.0)
## later 1.3.1 2023-05-02 [1] CRAN (R 4.2.2)
## lattice 0.21-9 2023-10-01 [1] CRAN (R 4.2.2)
## lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.2.0)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
## MASS 7.3-60 2023-05-04 [1] CRAN (R 4.2.2)
## Matrix 1.5-4.1 2023-05-18 [1] CRAN (R 4.2.0)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.0)
## mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
## miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0)
## myTAI * 1.0.1.9000 2023-10-03 [1] Github (drostlab/myTAI@e159136)
## philentropy * 0.7.0 2022-11-05 [1] CRAN (R 4.2.0)
## pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.0)
## pkgbuild 1.4.2 2023-06-26 [1] CRAN (R 4.2.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
## pkgload 1.3.3 2023-09-22 [1] CRAN (R 4.2.0)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.2.2)
## prettyunits 1.2.0 2023-09-24 [1] CRAN (R 4.2.0)
## processx 3.8.2 2023-06-30 [1] CRAN (R 4.2.0)
## profvis 0.3.8 2023-05-02 [1] CRAN (R 4.2.0)
## promises 1.2.1 2023-08-10 [1] CRAN (R 4.2.2)
## ps 1.7.5 2023-04-18 [1] CRAN (R 4.2.0)
## purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.2.2)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0)
## Rcpp 1.0.11 2023-07-06 [1] CRAN (R 4.2.0)
## readr * 2.1.4 2023-02-10 [1] CRAN (R 4.2.0)
## remotes 2.4.2.1 2023-07-18 [1] CRAN (R 4.2.2)
## reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.2.0)
## rlang 1.1.1 2023-04-28 [1] CRAN (R 4.2.0)
## rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.2.2)
## rstatix 0.7.2 2023-02-01 [1] CRAN (R 4.2.0)
## rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.2.0)
## sass 0.4.7 2023-07-15 [1] CRAN (R 4.2.0)
## scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
## shiny 1.7.5.1 2023-10-14 [1] CRAN (R 4.2.2)
## stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0)
## stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.2.0)
## survival 3.5-7 2023-08-14 [1] CRAN (R 4.2.0)
## tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.2.0)
## tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.2.0)
## tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.2.0)
## timechange 0.2.0 2023-01-11 [1] CRAN (R 4.2.0)
## tximport * 1.26.1 2022-12-16 [1] Bioconductor
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.2.2)
## urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.0)
## usethis 2.2.2 2023-07-06 [1] CRAN (R 4.2.0)
## utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0)
## vctrs 0.6.4 2023-10-12 [1] CRAN (R 4.2.2)
## viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.2.0)
## vroom 1.6.4 2023-10-02 [1] CRAN (R 4.2.2)
## withr 2.5.1 2023-09-26 [1] CRAN (R 4.2.0)
## xfun 0.40 2023-08-09 [1] CRAN (R 4.2.2)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.0)
## yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.0)
##
## [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
##
## ──────────────────────────────────────────────────────────────────────────────